#coding:utf-8
'''##########################
#	Project:Tianya_Only_Lz	#
#	Author:36901@QQ.COM		#
#	Date:2013-03-02			#
##########################'''
import urllib2,re,socket,os,sys,chardet
from bs4 import BeautifulSoup
from urllib import unquote
def start(url):
	soup=get_html(url)
	title = re.sub('_.*','',soup.title.string.encode('utf-8'))
	result= '\n=============%s===========\n'%(title)
	cat,tid = url.split('-')[1],url.split('-')[2]
	print cat,tid
	try:
		form_att = soup.find('form',{'action':''})['onsubmit'].encode('utf-8').split(',')
		pages=re.search('(\d+)',form_att[3]).group(0)
		for i in xrange(1,int(pages)+1):
			url='http://bbs.tianya.cn/post-%s-%s-%s.shtml'%(cat,tid,i)
			txt = start_collect(url)
			if txt:print u'第%s页已抓取完成'%(i)
			else:print u'第%s页无楼主回复'%(i)
			result += txt
	except:
		result += start_collect(url)
	save_result(cat,tid,result)

def start_collect(url):
	soup=get_html(url)
	txt=[]
	lz_name = soup.find('div',{'class':'atl-menu clearfix js-bbs-act'})['js_activityusername'].encode('utf-8')
	lz_posts = soup.findAll('div',{'class':'atl-item','_host':lz_name})#LZ回复标志
	for i in xrange(len(lz_posts)):
		post = lz_posts[i].find('div',{'class':'atl-content'}).text.encode('utf-8')
		if unquote(lz_name) in post:
			post = re.sub('%s.*'%(unquote(lz_name)),'',post)
		else:pass
		txt.append(re.sub('<div.*</div>','',post))
	content = '\n==============\n'.join(txt)
	print content#.decode('utf-8')
	return content

def get_html(url):
	'''Fetch url (10s timeout), decode the body as UTF-8 and return a BeautifulSoup.

	NOTE(review): the page is assumed to be UTF-8; the imported chardet module
	is never consulted -- confirm all target pages really are UTF-8.
	'''
	resp = urllib2.urlopen(url,timeout=10)
	try:
		html = resp.read().decode('utf-8')
	finally:
		# Close explicitly instead of relying on refcounting to reclaim the socket.
		resp.close()
	soup = BeautifulSoup(html)
	return soup

def save_result(cat,tid,result):
	'''Write the collected text to <cwd>/<cat>_<tid>.txt.

	result is already a UTF-8 encoded byte string, so a plain 'w' open is fine.
	'''
	file_name = os.path.join(os.getcwd(),'%s_%s.txt'%(cat,tid))
	# 'with' guarantees the handle is closed even if write() raises;
	# also stop shadowing the builtin name 'file'.
	with open(file_name,'w') as out:
		out.write(result)

if __name__ == '__main__':
	if len(sys.argv) > 1 and 'tianya' in sys.argv[1]:start(sys.argv[1])
	else:print u'请输入正确网址。example:tianya.py http://bbs.tianya.cn/post-414-46337-1.shtml'
	
